R Markdown

Team Members:

## Rashmi Raju Koparde - 230322
## Sathya Sudha Murugan - 229638
## Shweta Bhat - 229530

Loading libraries.

library(readxl)
library(data.table)
library(plyr)
library(ggbiplot)
## Loading required package: ggplot2
## Loading required package: scales
## Loading required package: grid
library(ggfortify)
## 
## Attaching package: 'ggfortify'
## The following object is masked from 'package:ggbiplot':
## 
##     ggbiplot
library(embed)
## Loading required package: recipes
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(tidytext)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(forcats)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.5     ✓ purrr   0.3.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x plotly::arrange()   masks dplyr::arrange(), plyr::arrange()
## x dplyr::between()    masks data.table::between()
## x readr::col_factor() masks scales::col_factor()
## x purrr::compact()    masks plyr::compact()
## x dplyr::count()      masks plyr::count()
## x purrr::discard()    masks scales::discard()
## x dplyr::failwith()   masks plyr::failwith()
## x plotly::filter()    masks dplyr::filter(), stats::filter()
## x dplyr::first()      masks data.table::first()
## x stringr::fixed()    masks recipes::fixed()
## x dplyr::id()         masks plyr::id()
## x dplyr::lag()        masks stats::lag()
## x dplyr::last()       masks data.table::last()
## x plotly::mutate()    masks dplyr::mutate(), plyr::mutate()
## x plotly::rename()    masks dplyr::rename(), plyr::rename()
## x plotly::summarise() masks dplyr::summarise(), plyr::summarise()
## x dplyr::summarize()  masks plyr::summarize()
## x purrr::transpose()  masks data.table::transpose()
library(ggplot2)
library(Rtsne)
library(umap)

##Loading data.

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   pathology = col_character()
## )
## ℹ Use `spec()` for the full column specifications.

## Applying contour on PCA

# Feature table reused by the UMAP and t-SNE sections below: every column of
# `dataset` except the third (presumably `pathology`, the only non-numeric
# column -- TODO confirm against the data file).
df <- dataset[, c(1, 2, 4:ncol(dataset))]

# PCA recipe: `pathology` is given the "id" role so it is not a predictor;
# all predictors are normalised and then passed to the PCA step.
pca_gucci <- recipe(~., data = dataset) %>%
  update_role(pathology, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

pca_prep <- prep(pca_gucci)

# Tidy the second recipe step (the PCA step) into long format.
tidied_pca <- tidy(pca_prep, 2)

# Reshape to one column per component, then drop the bookkeeping columns so
# only the PC columns remain.
x <- pivot_wider(tidied_pca, names_from = component, values_from = value)
x$terms <- NULL
x$id <- NULL

# NOTE(review): `x` comes from tidy() on the PCA step (one row per term), not
# from juice(pca_prep) (one row per sample) -- confirm this is the intended
# input for the contour.
fig <- plot_ly(
  x,
  x = ~PC1, y = ~PC2, z = ~PC3,
  type = "contour",
  width = 600, height = 500,
  contours = list(showlabels = TRUE),
  color = I("black")
)
fig %>% colorbar(title = "PC3")
## Warning: Didn't find a colorbar to modify.

##Applying scatter plot on PCA.

# Scatter plot of the first two principal components, coloured by pathology.
# Only x and y are mapped positionally: a 2-D scatter has no third positional
# aesthetic, so PC3 is not passed to aes().
juice(pca_prep) %>%
  ggplot(aes(PC1, PC2, fill = pathology)) +
  geom_point(aes(color = pathology), alpha = 0.7, size = 2) +
  #stat_ellipse(geom="polygon",col="black",alpha=0.1)+
  scale_color_manual(values = c("#31a354", "#2b8cbe", "#fc9272"))

## Applying UMAP on the dataset and scatter plot.

# Build the UMAP recipe one step at a time: `pathology` gets the "id" role,
# predictors are normalised, then embedded with UMAP.
umap_rec <- recipe(~., data = dataset)
umap_rec <- update_role(umap_rec, pathology, new_role = "id")
umap_rec <- step_normalize(umap_rec, all_predictors())
umap_rec <- step_umap(umap_rec, all_predictors())

umap_prep <- prep(umap_rec)

# Scatter plot of the two UMAP coordinates, coloured by pathology.
ggplot(juice(umap_prep), aes(umap_1, umap_2, fill = pathology)) +
  geom_point(aes(color = pathology), alpha = 0.7, size = 2) +
  #stat_ellipse(geom="polygon",col="black",alpha=0.1)+
  scale_color_manual(values = c("#31a354", "#2b8cbe", "#fc9272"))

##Applying Contour on U-Map.

# Three-component UMAP embedding of the feature table `df`.
data.umap <- umap(df, n_components = 3)

# Label the layout columns and convert the matrix to a data frame for plotting.
dims_umap <- data.umap$layout
colnames(dims_umap) <- paste0("UMAP", 1:3)
df_out <- as.data.frame(dims_umap)

# Contour of the third UMAP coordinate over the first two.
fig <- plot_ly(
  df_out,
  x = df_out$UMAP1,
  y = df_out$UMAP2,
  z = df_out$UMAP3,
  type = "contour",
  width = 600,
  height = 500,
  contours = list(showlabels = TRUE),
  color = I("black")
)
colorbar(fig, title = "UMAP")
## Warning: Didn't find a colorbar to modify.

##Applying t-SNE on dataset and Contour on T-SNE.

# Run t-SNE on the feature table, keeping three output dimensions.
dat.active <- df
tsne_out <- Rtsne(
  dat.active,
  dims = 3,
  perplexity = 30,
  max_iter = 500,
  verbose = TRUE
)
## Performing PCA
## Read the 122 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.02 seconds (sparsity = 0.875840)!
## Learning embedding...
## Iteration 50: error is 55.652561 (50 iterations in 0.05 seconds)
## Iteration 100: error is 56.284607 (50 iterations in 0.03 seconds)
## Iteration 150: error is 58.561825 (50 iterations in 0.04 seconds)
## Iteration 200: error is 56.004091 (50 iterations in 0.03 seconds)
## Iteration 250: error is 57.035669 (50 iterations in 0.04 seconds)
## Iteration 300: error is 1.018256 (50 iterations in 0.02 seconds)
## Iteration 350: error is 0.527153 (50 iterations in 0.03 seconds)
## Iteration 400: error is 0.309658 (50 iterations in 0.02 seconds)
## Iteration 450: error is 0.268569 (50 iterations in 0.02 seconds)
## Iteration 500: error is 0.257745 (50 iterations in 0.02 seconds)
## Fitting performed in 0.29 seconds.
# Collect the three t-SNE coordinates into a data frame; the pathology labels
# are not needed for the contour plot.
t <- setNames(
  as.data.frame(tsne_out$Y[, 1:3]),
  c("tsne1", "tsne2", "tsne3")
)

# Contour of the third t-SNE coordinate over the first two.
fig <- plot_ly(
  t,
  x = t$tsne1,
  y = t$tsne2,
  z = t$tsne3,
  type = "contour",
  width = 600,
  height = 500,
  contours = list(showlabels = TRUE),
  color = I("black")
)
colorbar(fig, title = "Tsne3")
## Warning: Didn't find a colorbar to modify.

##Scatter Plot on tsne.

# t-SNE coordinates together with the pathology label of each row.
t <- data.frame(
  tsne1 = tsne_out$Y[, 1],
  tsne2 = tsne_out$Y[, 2],
  tsne3 = tsne_out$Y[, 3],
  pathology = dataset$pathology
)

# Scatter plot of the first two t-SNE coordinates, coloured by pathology.
ggplot(t, aes(tsne1, tsne2, fill = pathology)) +
  geom_point(aes(color = pathology), alpha = 0.7, size = 2) +
  #stat_ellipse(geom="polygon",col="black",alpha=0.1)+
  scale_color_manual(values = c("#31a354", "#2b8cbe", "#fc9272"))

## Hyperparameter tuning.

set.seed(1) # for reproducibility

# Settings shared by both t-SNE runs in this section.
perpl <- c(30, 35) # perplexities to compare
iterations <- 500  # passed to Rtsne as max_iter
learning <- 200    # passed to Rtsne as eta

# Build the contour figure for one 3-D t-SNE fit.
#   tsne_fit: object returned by Rtsne(); its `Y` matrix must have at least
#             three columns.
# Returns the plotly figure, so a top-level call displays it.
tsne_contour <- function(tsne_fit) {
  coords <- data.frame(
    tsne1 = tsne_fit$Y[, 1],
    tsne2 = tsne_fit$Y[, 2],
    tsne3 = tsne_fit$Y[, 3]
  )
  fig <- plot_ly(
    coords,
    x = coords$tsne1, y = coords$tsne2, z = coords$tsne3,
    type = "contour",
    width = 600, height = 500,
    contours = list(showlabels = TRUE),
    color = I("black")
  )
  fig %>% colorbar(title = "Tsne3")
}

# Run 1: first perplexity value.
tsne_out <- Rtsne(df, dims = 3, perplexity = perpl[1], verbose = TRUE,
                  max_iter = iterations, eta = learning)
## Performing PCA
## Read the 122 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.02 seconds (sparsity = 0.875840)!
## Learning embedding...
## Iteration 50: error is 51.737993 (50 iterations in 0.03 seconds)
## Iteration 100: error is 55.399837 (50 iterations in 0.04 seconds)
## Iteration 150: error is 53.816672 (50 iterations in 0.03 seconds)
## Iteration 200: error is 55.667034 (50 iterations in 0.03 seconds)
## Iteration 250: error is 54.217633 (50 iterations in 0.04 seconds)
## Iteration 300: error is 1.014411 (50 iterations in 0.02 seconds)
## Iteration 350: error is 0.450896 (50 iterations in 0.02 seconds)
## Iteration 400: error is 0.348436 (50 iterations in 0.02 seconds)
## Iteration 450: error is 0.282917 (50 iterations in 0.02 seconds)
## Iteration 500: error is 0.264900 (50 iterations in 0.02 seconds)
## Fitting performed in 0.28 seconds.
tsne_contour(tsne_out)
## Warning: Didn't find a colorbar to modify.

# Run 2: second perplexity value.
tsne_out <- Rtsne(df, dims = 3, perplexity = perpl[2], verbose = TRUE,
                  max_iter = iterations, eta = learning)
## Performing PCA
## Read the 122 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 3, perplexity = 35.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.02 seconds (sparsity = 0.961973)!
## Learning embedding...
## Iteration 50: error is 51.148279 (50 iterations in 0.03 seconds)
## Iteration 100: error is 54.401081 (50 iterations in 0.04 seconds)
## Iteration 150: error is 56.144805 (50 iterations in 0.06 seconds)
## Iteration 200: error is 55.124023 (50 iterations in 0.07 seconds)
## Iteration 250: error is 49.930915 (50 iterations in 0.07 seconds)
## Iteration 300: error is 1.135920 (50 iterations in 0.03 seconds)
## Iteration 350: error is 0.522236 (50 iterations in 0.02 seconds)
## Iteration 400: error is 0.299130 (50 iterations in 0.02 seconds)
## Iteration 450: error is 0.245188 (50 iterations in 0.02 seconds)
## Iteration 500: error is 0.227364 (50 iterations in 0.02 seconds)
## Fitting performed in 0.39 seconds.
tsne_contour(tsne_out)
## Warning: Didn't find a colorbar to modify.

##References:

#https://cran.r-project.org/web/packages/ggfortify/vignettes/plot_pca.html
#https://juliasilge.com/blog/cocktail-recipes-umap/
#https://jkzorz.github.io/2020/02/29/contour-plots.html
#https://www.r-statistics.com/2016/07/using-2d-contour-plots-within-ggplot2-to-visualize-relationships-between-three-variables/